import pandas as pd
import numpy as np
# Read the raw heart-disease table and take a first look at it.
dt = pd.read_csv("heart.csv")
dt
dt.info()
dt.columns
# Swap the file's original headers for descriptive names. The assignment is
# positional: the list must name all 14 columns in file order.
descriptive_names = [
    'age',
    'sex',
    'chest_pain_type',
    'resting_blood_pressure',
    'cholesterol',
    'fasting_blood_sugar',
    'rest_ecg',
    'max_heart_rate_achieved',
    'exercise_induced_angina',
    'st_depression',
    'st_slope',
    'num_major_vessels',
    'thalassemia',
    'target',
]
dt.columns = descriptive_names
# Inspect how many rows carry each code in the integer-coded categorical
# columns. These are bare expressions: their results are only shown when run
# interactively (e.g. as the last statement of a notebook cell) and nothing
# is stored.
dt['sex'].value_counts()
dt['chest_pain_type'].value_counts()
dt['fasting_blood_sugar'].value_counts()
dt['rest_ecg'].value_counts()
dt['exercise_induced_angina'].value_counts()
dt['st_slope'].value_counts()
dt['thalassemia'].value_counts()
# Translate the integer codes of the categorical columns into readable labels.
#
# The original did this with chained indexing (dt[col][mask] = label). pandas
# documents that form as unreliable: the write may land on a temporary copy,
# it raises SettingWithCopyWarning, and it has no effect at all under
# Copy-on-Write. Replacing each column in one assignment avoids all of that.
#
# Codes without an entry in a mapping are left unchanged; in particular
# thalassemia == 0 has no label here and stays as the integer 0.
_code_labels = {
    'sex': {0: 'female', 1: 'male'},
    'chest_pain_type': {
        0: 'typical angina',
        1: 'atypical angina',
        2: 'non-anginal pain',
        3: 'asymptomatic',
    },
    'fasting_blood_sugar': {
        0: 'lower than 120mg/ml',
        1: 'greater than 120mg/ml',
    },
    'rest_ecg': {
        0: 'normal',
        1: 'ST-T wave abnormality',
        2: 'left ventricular hypertrophy',
    },
    'exercise_induced_angina': {0: 'no', 1: 'yes'},
    'st_slope': {0: 'upsloping', 1: 'flat', 2: 'downsloping'},
    'thalassemia': {
        1: 'normal',
        2: 'fixed defect',
        3: 'reversable defect',
    },
}
for _column, _labels in _code_labels.items():
    # Whole-column assignment: no chained indexing, no partial writes.
    dt[_column] = dt[_column].replace(_labels)
# Look at the frame after recoding, re-check the thalassemia value counts and
# list the column dtypes. Bare expressions: output appears only when run
# interactively.
dt
dt['thalassemia'].value_counts()
dt.dtypes
# Keep only the rows whose thalassemia value is not the integer 0 (the
# recoding step assigns labels for codes 1, 2 and 3 only, so 0 is the one
# value left without a label).
has_thalassemia_label = dt['thalassemia'] != 0
data = dt[has_thalassemia_label]
data.info()

# Persist the cleaned table without the index column, then read it back.
data.to_csv('CleanHeart.csv', index=False)
df = pd.read_csv('CleanHeart.csv')
df
# Install pandas-profiling from the master-branch archive on GitHub.
# NOTE(review): the leading "!" is IPython/Jupyter shell-escape syntax, so this
# line only works inside a notebook; it is a syntax error in a plain .py script.
!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip
from pandas_profiling import ProfileReport
# Build a profiling report of the reloaded, cleaned dataset.
profile = ProfileReport(df , title ='Pandas Profiling',explorative=True)
# Render the report as widgets (presumably needs a Jupyter front end - confirm).
profile.to_widgets()
# Also write the report to an HTML file.
profile.to_file("heart_report.html")
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
df.columns
# Split the cleaned table into predictors and the label column.
# The original first built both objects from `df` (the CSV round-trip) and
# then immediately rebuilt them from `data` (the in-memory filtered frame).
# Only the second pair was ever used, so the dead first pair is dropped.
feature_data = data.drop(columns=['target'])
target_data = data.target
# Partition the predictors by dtype: object columns are treated as
# categorical, int/float columns as numeric.
categorical_dtypes = ['object']
numeric_dtypes = ['int','float']

cat_data = feature_data.select_dtypes(include=categorical_dtypes)
print(cat_data.columns)

num_data = feature_data.select_dtypes(include=numeric_dtypes)
print(num_data.columns)
# Fit a stand-alone encoder and scaler on the full predictor set (i.e. before
# any train/test split) and materialise the transformed columns as frames.
# `cat` and `num` are not referenced again in the visible part of the file;
# the models below use their own encoder/scaler inside `preprocessor`.
oe = OrdinalEncoder()
ss = StandardScaler()
oe.fit(cat_data)
ss.fit(num_data)

encoded_values = oe.transform(cat_data)
scaled_values = ss.transform(num_data)
cat = pd.DataFrame(data=encoded_values, columns=cat_data.columns)
num = pd.DataFrame(data=scaled_values, columns=num_data.columns)
# Per-group preprocessing: ordinal-encode the categorical columns and
# standard-scale the numeric ones, then combine both into one transformer.
cat_pipeline = make_pipeline(OrdinalEncoder())
num_pipeline = make_pipeline(StandardScaler())

# Order matters: categorical block first, numeric block second.
column_steps = [
    (cat_pipeline, cat_data.columns),
    (num_pipeline, num_data.columns),
]
preprocessor = make_column_transformer(*column_steps)
from sklearn.model_selection import train_test_split
# Hold out a test set using train_test_split's default proportions.
# A fixed random_state makes the split repeatable. Without it every run
# produces a different split, so the printed scores, the exported test CSV
# and the score-named model files cannot be reproduced.
trainX, testX, trainY, testY = train_test_split(
    feature_data, target_data, random_state=42
)
from sklearn.linear_model import LogisticRegression
# Logistic regression on top of the shared preprocessor.
pipeline = make_pipeline(preprocessor, LogisticRegression())
# The original repeated the fit/score statements verbatim (a re-run notebook
# cell). The second pass retrained on the same data and printed the same
# lines again, so a single pass is kept.
pipeline.fit( trainX , trainY)
print("Training Score : ",pipeline.score(trainX, trainY))
print('Testing Score : ',pipeline.score(testX, testY))
from sklearn.ensemble import RandomForestClassifier
# Random forest (100 trees) on top of the shared preprocessor.
rf_pipeline = make_pipeline( preprocessor , RandomForestClassifier( n_estimators= 100 ))
# The original fitted and scored this pipeline twice in a row (a re-run
# notebook cell). The second fit simply discarded the first forest, so a
# single fit/score pass is kept.
rf_pipeline.fit( trainX , trainY)
print("Training Score : ",rf_pipeline.score(trainX, trainY))
print('Testing Score : ',rf_pipeline.score(testX, testY))
from sklearn.model_selection import GridSearchCV
# Tune the random forest with a 5-fold cross-validated grid search over
# 3 tree counts x 2 split criteria x 3 depth limits (18 combinations).
gs_pipeline = make_pipeline(preprocessor, RandomForestClassifier(n_estimators=100))
# Keys are prefixed with the step name make_pipeline gives the forest.
params = {
    'randomforestclassifier__n_estimators': [100, 200, 250],
    'randomforestclassifier__criterion': ['gini', 'entropy'],
    'randomforestclassifier__max_depth': [5, 10, 15],
}
gs = GridSearchCV(gs_pipeline, param_grid=params, cv=5, n_jobs=4)
# The original ran this fit/report sequence twice back to back (a re-run
# notebook cell), which repeats the entire search for no benefit. One pass
# is kept.
gs.fit( trainX , trainY)
print("Training Score : ",gs.score(trainX, trainY))
print('Testing Score : ',gs.score(testX, testY))
print('******************************')
print('Best params :',gs.best_params_)
print('Best Score :', gs.best_score_ )
# Import the joblib library to create and save the model files for use in future predictions.
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23; joblib is imported directly instead (this file already imports
# from the stand-alone package when it reloads the model).
import joblib
# Persist the three fitted estimators. File names are unchanged.
# NOTE(review): the numbers in the names look like scores from one particular
# run - confirm they still match before relying on them.
joblib.dump(pipeline, 'lm_88.joblib')
joblib.dump(rf_pipeline, 'rf_n100_86.pkl')
# This stores the whole GridSearchCV object, not just the tuned forest.
joblib.dump(gs, 'rf_n200_md5_gini_85.pkl')
testX
# Attach the held-out labels to the test predictors and export the result
# without the index. This mutates testX: from here on it carries a 'target'
# column that must be dropped before the frame is passed to a model.
testX['target'] = testY
testX.to_csv(path_or_buf='HeartTestData1.csv', index=False)
# Smoke-test the logistic-regression pipeline on one hand-written patient row.
# Column order matches the predictor columns used for training.
sample_columns = [
    'age', 'sex', 'chest_pain_type', 'resting_blood_pressure',
    'cholesterol', 'fasting_blood_sugar', 'rest_ecg',
    'max_heart_rate_achieved', 'exercise_induced_angina', 'st_depression',
    'st_slope', 'num_major_vessels', 'thalassemia',
]
sample_row = [
    49, 'male', 'non-anginal pain', 109,
    102, 'lower than 120mg/ml', 'ST-T wave abnormality',
    138, 'no', 2.2,
    'flat', 2, 'fixed defect',
]
sample_frame = pd.DataFrame([sample_row], columns=sample_columns)
pipeline.predict(sample_frame)
from joblib import load
# Reload the logistic-regression pipeline from the path it was dumped to.
# The original asked for 'lm_88.pkl', a file this script never writes (the
# dump above uses 'lm_88.joblib'), so the load failed with a missing file.
model = load('lm_88.joblib')
# testX had the 'target' column attached for the CSV export; drop it so the
# model only sees the predictor columns.
pred = model.predict(testX.drop(columns='target'))
import pandas as pd
# Put predictions and true labels side by side for inspection.
dc = pd.DataFrame()
dc['pred'] = pred
# `pred` is a plain array, so dc gets a fresh 0..n-1 index, while testX still
# carries the row labels of the original table. Assigning the Series directly
# would align on those labels and pair predictions with the wrong (or
# missing) targets; use the raw values to keep positional order.
dc['target'] = testX['target'].values
# Class balance of the test labels. The original evaluated this before `dc`
# existed, which raises NameError when the script runs top to bottom.
dc.target.value_counts()
# Rows predicted correctly, then rows predicted incorrectly.
dc[ dc.pred == dc.target ]
dc[ dc.pred != dc.target ]
# Report accuracy from the actual predictions instead of the hard-coded
# 66 / 75, which was only true for one particular run and split.
# `pred` and `testY` are in the same row order (both come from the same
# test split), so they are compared position by position.
_n_total = len(pred)
_n_correct = int((pred == testY.values).sum())
print('Model Accuracy :( No. of right predictions / Total no. of rows in data ) * 100 ')
print('Model Accuracy : ', (_n_correct / _n_total) * 100 , '%')